Load libraries

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.5
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(downloader)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(dunn.test)
library(broom)

1) Read in the gapminder_clean.csv data as a tibble using read_csv

Download gapminder_clean.csv from Github/Bioinformatics-Research-Network

url <- "https://raw.githubusercontent.com/Bioinformatics-Research-Network/training-requirements/main/R%20for%20Data%20Science/gapminder_clean.csv"
filename <- basename(url)
if(!file.exists(filename)) download(url, destfile=filename)

Read in gapminder_clean.csv as a tibble using read_csv

dat <- read.csv(filename)
dim(dat)
## [1] 2607   20

The gapminder data consists of a matrix with 2607 rows and 20 columns.

2) Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap for the filtered data

dat %>% 
  filter(Year == 1962) %>%
  # keep only relevant columns
  transmute(gdpPercap=gdpPercap,CO2.emissions..metric.tons.per.capita.= CO2.emissions..metric.tons.per.capita.) %>%
  # remove rows which contain NAs
  drop_na() %>%
  ggplot(aes(x = gdpPercap, y = CO2.emissions..metric.tons.per.capita.)) +
  geom_point() +
  ylab("CO2 emissions (metric tons per capita)")

# Because of one very large outlier the analysis repreated with log transformed data

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter_at(vars(CO2.emissions..metric.tons.per.capita., gdpPercap), all_vars(!is.na(.))) %>%
  ggplot(aes(x = log(gdpPercap), y = log(CO2.emissions..metric.tons.per.capita.))) +
  geom_point() +
  ylab("log(CO2 emissions (metric tons per capita))")

Log transformed data can be used to make data as “normal” as possible so that the statistical analysis results become more valid.

3) On the filtered data, calculate the pearson correlation of ‘CO2 emissions (metric tons per capita)’ and gdpPercap. What is the Pearson R value and associated p value?

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter_at(vars(CO2.emissions..metric.tons.per.capita., gdpPercap), all_vars(!is.na(.))) %>%
  pivot_longer( gdpPercap, names_to = "x_var", values_to = "x_val") %>%
  pivot_longer( CO2.emissions..metric.tons.per.capita., names_to = "y_var", values_to = "y_val") %>%
  group_by(x_var, y_var) %>%
   summarise(cor_coef = cor.test(x_val, y_val)$estimate,
            p_val = cor.test(x_val, y_val)$p.value)
## `summarise()` has grouped output by 'x_var'. You can override using the `.groups` argument.
## # A tibble: 1 x 4
## # Groups:   x_var [1]
##   x_var     y_var                                  cor_coef    p_val
##   <chr>     <chr>                                     <dbl>    <dbl>
## 1 gdpPercap CO2.emissions..metric.tons.per.capita.    0.926 1.13e-46

The Pearson R value of ‘CO2 emissions (metric tons per capita)’ and gdpPercap in Year 1962 is 0.9260817 and the associated p value is 1.128679e-46.

# Because of one very large outlier the analysis repreated with log transformed data

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter_at(vars(CO2.emissions..metric.tons.per.capita., gdpPercap), all_vars(!is.na(.))) %>%
  pivot_longer( gdpPercap, names_to = "x_var", values_to = "x_val") %>%
  pivot_longer( CO2.emissions..metric.tons.per.capita., names_to = "y_var", values_to = "y_val") %>%
  group_by(x_var, y_var) %>%
  # calculate pearson R value and associated p value
  summarise(cor_coef = cor.test(log(x_val), log(y_val))$estimate,
            p_val = cor.test(log(x_val), log(y_val))$p.value)
## `summarise()` has grouped output by 'x_var'. You can override using the `.groups` argument.
## # A tibble: 1 x 4
## # Groups:   x_var [1]
##   x_var     y_var                                  cor_coef    p_val
##   <chr>     <chr>                                     <dbl>    <dbl>
## 1 gdpPercap CO2.emissions..metric.tons.per.capita.    0.860 8.90e-33

The Pearson R value of log transformed ‘CO2 emissions (metric tons per capita)’ and gdpPercap in Year 1962 is 0.8602081 and the associated p value is 8.903567e-33.

4) On the unfiltered data, answer “In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?” Filter the dataset to that year for the next step…

year.cor.max <- dat %>%
  # remove rows which contain NAs
  filter_at(vars(CO2.emissions..metric.tons.per.capita., gdpPercap), all_vars(!is.na(.))) %>%
  # split origial tibble in lists depending on the year
  split(.$Year) %>%
  # perform pearson correlation test on each year list, keep only the pearson R value
  map(~cor.test(log(.$CO2.emissions..metric.tons.per.capita.), log(.$gdpPercap))$estimate) %>% 
  # merge list of lists to one final list
  unlist

year.cor.max
##  1962.cor  1967.cor  1972.cor  1977.cor  1982.cor  1987.cor  1992.cor  1997.cor 
## 0.8602081 0.8736773 0.8843802 0.9014278 0.9151610 0.9080487 0.8934602 0.9208183 
##  2002.cor  2007.cor 
## 0.9300017 0.9275425
year.cor.max[which.max(year.cor.max)]
##  2002.cor 
## 0.9300017

In year 2002 is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest.

5) Using plotly, create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.

viz <- dat %>% 
    filter(Year == 2002) %>%
    # remove rows which contain NAs
    filter_at(vars(gdpPercap, CO2.emissions..metric.tons.per.capita.), all_vars(!is.na(.))) %>%
    ggplot(aes(x = log(gdpPercap), y = log(CO2.emissions..metric.tons.per.capita.), text=paste0('</br>log(gdpPercap): ', gdpPercap, '</br>log(CO2 emissions (metric tons per capita)): ', CO2.emissions..metric.tons.per.capita., '</br>population: ', pop, '</br>continent: ', continent, '</br>Country Name: ', Country.Name))) +
    geom_point(aes(size = pop, color = continent)) +
    ylab("log(CO2 emissions (metric tons per capita))")
  
ggplotly(viz, tooltip = c("text"))

6) What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’?

We choose a suitable test based on the ‘Flow Chart for Selecting Commonly Used Statistical Tests’. We have continuous data and want to check for differences in mean for more than two groups. 

Therefore, we have to check if the parametric assumptions are satisfied. For ANOVA, we have four parametric assumptions that must be met:

6.1) Samples must be independent

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter_at(vars(Energy.use..kg.of.oil.equivalent.per.capita.), all_vars(!is.na(.))) %>%
  ggplot(aes(x=continent, y=Energy.use..kg.of.oil.equivalent.per.capita., fill = continent)) +
  geom_boxplot() +
  ylab("Energy use (kg of oil equivalent per capita)")

First, exploratory data analysis (EDA) was performed on the log transformed data. We can say that the samples are independent.

6.2) Population variances must be equal (Levene’s test)

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter_at(vars(Energy.use..kg.of.oil.equivalent.per.capita.), all_vars(!is.na(.))) %>%
  # perform Levene’s test
  do(tidy(leveneTest(.$Energy.use..kg.of.oil.equivalent.per.capita.~.$continent, data = .)))
## # A tibble: 1 x 4
##   statistic  p.value    df df.residual
##       <dbl>    <dbl> <int>       <int>
## 1      12.4 8.00e-10     4         843

We reject the H0 hypothesis that the variances are equal based on the p value of our Levene’s test. Therefore, we can not use an ANOVA test and must use a Kruskal-Wallis test instead.

6.3) Kruskal-Wallis test

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter_at(vars(Energy.use..kg.of.oil.equivalent.per.capita.), all_vars(!is.na(.))) %>%
  # perform Kruskal-Wallis test
  do(tidy(kruskal.test(.$Energy.use..kg.of.oil.equivalent.per.capita.~.$continent, data = .)))
## # A tibble: 1 x 4
##   statistic  p.value parameter method                      
##       <dbl>    <dbl>     <int> <chr>                       
## 1      319. 1.01e-67         4 Kruskal-Wallis rank sum test

The resulting p value of our Kruskal-Wallis test shows that there is a significant difference in the mean for our data. Because of our significant result, we additionally perform a dunn’s test.

dat6 <- dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter_at(vars(Energy.use..kg.of.oil.equivalent.per.capita.), all_vars(!is.na(.)))

# perform dunn's test
dunn.test(dat6$Energy.use..kg.of.oil.equivalent.per.capita., g=dat6$continent)
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 318.6763, df = 4, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                 (No adjustment)                                
## Col Mean-|
## Row Mean |     Africa   Americas       Asia     Europe
## ---------+--------------------------------------------
## Americas |  -6.174961
##          |    0.0000*
##          |
##     Asia |  -4.852592   1.278883
##          |    0.0000*     0.1005
##          |
##   Europe |  -16.43361  -9.630909  -10.95868
##          |    0.0000*    0.0000*    0.0000*
##          |
##  Oceania |  -8.148201  -5.456302  -6.014711  -1.543152
##          |    0.0000*    0.0000*    0.0000*     0.0614
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

7) Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990?

Again, we choose a suitable test based on the ‘Flow Chart for Selecting Commonly Used Statistical Tests’. We have continuous data and want to check for differences in mean for two groups. 

Therefore, we have to check if the parametric assumptions are satisfied. For Student’s unpaired t-test, we have four parametric assumptions that must be met:

7.1) The observations are sampled independently

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter_at(vars(Imports.of.goods.and.services....of.GDP.), all_vars(!is.na(.))) %>%
  ggplot(aes(x=continent, y=Imports.of.goods.and.services....of.GDP., fill = continent)) +
  geom_boxplot() +
  ylab("Imports of goods and services (% of GDP)")

First, exploratory data analysis (EDA) was performed on the log transformed data. We can say that the samples are independent. We can also say that the dependent variable is measured on the incremental level year and that independent variables consist of matched pairs.

7.2) The dependent variable is normally distributed (Shapiro-Wilk normality test)

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter_at(vars(Imports.of.goods.and.services....of.GDP.), all_vars(!is.na(.))) %>%
  # perform Shapiro-Wilk normality test
  do(tidy(shapiro.test(.$Imports.of.goods.and.services....of.GDP.)))
## # A tibble: 1 x 3
##   statistic  p.value method                     
##       <dbl>    <dbl> <chr>                      
## 1     0.849 1.46e-13 Shapiro-Wilk normality test

We reject the H0 hypothesis that the data is normally distributed based on the p values of our Shapiro-Wilk normality test. Therefore, we can not use a Student’s unpaired t-test and must use a Wilcoxon Rank sums test instead.

7.3) Wilcoxon Rank sums test

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter_at(vars(Imports.of.goods.and.services....of.GDP.), all_vars(!is.na(.))) %>%
  # perform Wilcoxon Rank sums test
  do(tidy(wilcox.test(.$Imports.of.goods.and.services....of.GDP.~.$continent)))
## # A tibble: 1 x 4
##   statistic p.value method                                           alternative
##       <dbl>   <dbl> <chr>                                            <chr>      
## 1      5707   0.787 Wilcoxon rank sum test with continuity correcti~ two.sided

We reject the H0 hypothesis that there is a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990 based on the p values of our Wilcoxon Rank sums test.

8) What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

First, exploratory data analysis (EDA) was performed on the log transformed data.

viz <- dat %>%
  # remove rows which contain NAs
  filter_at(vars(Year, Country.Name, Population.density..people.per.sq..km.of.land.area.), all_vars(!is.na(.))) %>%
  ggplot(aes(x=Year, y=Population.density..people.per.sq..km.of.land.area., group = Country.Name)) + 
  geom_line() + 
  ylab("Population density (people per sq. km of land area)")

ggplotly(viz)
highest.number <- dat %>%
  # remove rows which contain NAs
  filter_at(vars(Year, Country.Name, Population.density..people.per.sq..km.of.land.area.), all_vars(!is.na(.))) %>%
  # split origial tibble in lists depending on the year
  split(.$Year) %>%
  # get maximum of variable in every year
  map(~max(.$Population.density..people.per.sq..km.of.land.area.)) %>%
  # merge list of lists into one list
  unlist

# map highest scoring variable from every year to respective country
highest.ranked.countries <- data.frame(lapply(highest.number, function(x) droplevels(dat$Country.Name[which(dat$Population.density..people.per.sq..km.of.land.area. == x)])), stringsAsFactors=FALSE)

# count occurences of highest scoring countries
apply(highest.ranked.countries, MARGIN=1, table)
##                  [,1]
## Macao SAR, China    5
## Monaco              5

The countries that have the highest ‘Population density (people per sq. km of land area)’ across all years are Macao SAR and Monaco.

9) What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962?

First, exploratory data analysis (EDA) was performed on the log transformed data.

viz <- dat %>%
  # remove rows which contain NAs
  filter_at(vars(Year, Country.Name, Life.expectancy.at.birth..total..years.), all_vars(!is.na(.))) %>%
  ggplot(aes(x=Year, y=Life.expectancy.at.birth..total..years., group = Country.Name)) + 
  geom_line() + 
  ylab("Life expectancy at birth, total (years)")

ggplotly(viz)

Get difference for Life expectancy at birth, total (years) for every country between 1962 and 2007.

dat %>%
  filter(Year == 1962 | Year == 2007) %>%
  # remove rows which contain NAs
  filter_at(vars(Year, Country.Name, Life.expectancy.at.birth..total..years.), all_vars(!is.na(.))) %>%
  group_by(Country.Name) %>%
  # Calculate difference between 1962 and 2007
  mutate(diff = Life.expectancy.at.birth..total..years. - lag(Life.expectancy.at.birth..total..years., default = 0)) %>% 
  select(Country.Name, diff) %>%
  # keep only rows with calculated difference
  filter(row_number() %% 2 == 0) %>%
  # sort by difference
  arrange(desc(diff))
## # A tibble: 236 x 2
## # Groups:   Country.Name [236]
##    Country.Name        diff
##    <fct>              <dbl>
##  1 Maldives            36.9
##  2 Bhutan              33.2
##  3 Timor-Leste         31.1
##  4 Tunisia             30.9
##  5 Oman                30.8
##  6 Nepal               30.6
##  7 China               29.9
##  8 Yemen, Rep.         27.2
##  9 Saudi Arabia        26.7
## 10 Iran, Islamic Rep.  26.6
## # ... with 226 more rows

Country Maldives has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962.